Scrape the list of subdivisions (neighborhoods) of a city¶

In [1]:
# Using beautiful soup to scrape data
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

Create a function to get the latitude and longitude of a place from its address¶

In [2]:
# use the geocoder library; if it is not installed, run !conda install -c conda-forge geocoder
import geocoder
# An API key is required for the geocoder library to work. Ideally the key is saved in the
# OS environment variables (e.g. as BING_API_KEY) and read from there instead of being hard-coded.
import os
# A Bing Maps key is used here, since Bing geocoding is chosen over Google geocoding.
BING_API_KEY = 'AksNN-3luSfNBssyZ3Ju4i78nIrFLt1UtYo--YWQj9oyfxSwyXkdsqykWk3FeTXB' # os.environ['BING_API_KEY']
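Rather than hard-coding the key in the notebook, a safer pattern is to export it in the shell and read it here; a minimal sketch, assuming the key has been exported as BING_API_KEY:

# read the key from the environment, falling back to a placeholder if it is not set
# (the placeholder string below is just an illustration)
BING_API_KEY = os.environ.get('BING_API_KEY', '<your-bing-maps-key>')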
In [3]:
# This function takes an address and returns the latitude and longitude of that address
def get_latlng(address):
    # using the Bing geocoder API with the key defined above
    g = geocoder.bing(address, key = BING_API_KEY)
    return pd.Series(g.latlng)
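A quick sanity check of the helper (the address string is just an illustration; the printed values depend on the geocoder response):

lat, lng = get_latlng('Ranchi, Jharkhand, India')
print(lat, lng)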

Create a function to return a soup object from a URL¶

In [4]:
# Function returns a BeautifulSoup object for the given URL
def get_soup_object(url):
    source_data = requests.get(url).text
    return BeautifulSoup(source_data,'lxml')
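A slightly more defensive variant (a sketch, not used in the cells below; the function name is hypothetical) would add a timeout and fail loudly on a bad HTTP status instead of silently parsing an error page:

def get_soup_object_safe(url, timeout=10):
    # raise for non-200 responses before handing the text to BeautifulSoup
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')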

Ranchi¶

Step 1: Initialize the URL and get the soup object from the website¶

In [ ]:
# initialize url
rnc_data_url = 'http://vlist.in/district/364.html'
# use function to get soup object
soup = get_soup_object(rnc_data_url)
print('Soup object created')
In [ ]:
village_url_header = 'http://vlist.in'
district_name = 'Ranchi'

Step 2: Extract the rows from the website data¶

In [ ]:
# Function extracts a row of the table on the government website, returning the link and the
# name associated with that row
def extract_row(table_row):
    table_row = table_row.find_all('td')
    # first cell holds the serial number (not used further)
    index = table_row[0].text
    # second cell holds the name and a relative link to its detail page
    link = village_url_header + table_row[1].find('a')['href']
    name = table_row[1].text
    return link, name
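As a quick illustration of what extract_row does, the made-up row below mirrors the structure of the vlist.in tables (the href is purely illustrative):

sample_html = '<table><tr><td>1</td><td><a href="/block/0000.html">Sample Block</a></td></tr></table>'
sample_row = BeautifulSoup(sample_html, 'lxml').tr
link, name = extract_row(sample_row)
print(link, '|', name)  # -> http://vlist.in/block/0000.html | Sample Block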
In [ ]:
# extracting the block rows, skipping the first two rows of the table
table_rows = soup.find_all('tr')
table_rows = table_rows[2:]
data = []
# for every block row, also extract all of the villages in that block
for table_row in table_rows:
    
    sub_district_link, block_name = extract_row(table_row)
    print(block_name)
    # getting the villages in the block
    soup_village = get_soup_object(sub_district_link)
    # get all the table rows for the individual villages in the block (skip the header row)
    sub_table_rows = soup_village.find_all('tr')
    sub_table_rows = sub_table_rows[1:]
    
    # extract each village name and store it in data along with the block and district names
    for sub_table_row in sub_table_rows:
    
        sub_link, village_name = extract_row(sub_table_row)
        
        data.append([village_name, block_name, district_name])

print(data[0])

Step 3: Store the data in a dataframe¶

In [ ]:
# build a data frame from the extracted data
header = ['Village','Block','District']
df = pd.DataFrame(data= data, columns= header)
In [ ]:
df.head()

Step 4: Get latitude and longitude for subdivisions¶

In [ ]:
# using the get_latlng function to define latitude and longitude columns of the data frame
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Village +', '+ x.Block + ', ' + x.District), axis=1)
df.head()
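Every row triggers one geocoding request, so long tables can run into the Bing API's rate limits. A gentler variant (a hypothetical helper, not what the cell above uses) adds a short pause between calls:

import time

def get_latlng_throttled(address, pause=0.2):
    # same lookup as get_latlng, with a short pause between requests
    # (the pause length is an arbitrary choice)
    time.sleep(pause)
    g = geocoder.bing(address, key=BING_API_KEY)
    return pd.Series(g.latlng)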
In [ ]:
df.info()
In [ ]:
df.dropna(inplace= True)
df.info()

Step 5: Store the data in a csv¶

In [ ]:
# save the data to a csv file; it will be used later
df.to_csv('ranchi_villages.csv')
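Because to_csv also writes the default integer index as the first column, index_col=0 is needed when the file is read back (as done for the Kolkata file further below); for example (the variable name is arbitrary):

df_saved = pd.read_csv('ranchi_villages.csv', index_col=0)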

Now repeating all the above steps for Delhi, Mumbai, Kolkata and Chennai¶
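The four city scrapes below all follow the same pattern: fetch the page, slice out the relevant <ul> groups, and collect the <li> texts together with the city name. A small helper could wrap this; the function name and arguments here are hypothetical, and the slice bounds used in the cells below were found by inspecting each page:

def scrape_neighborhood_list(page_url, city, ul_start, ul_stop):
    # collect neighborhood names from the <ul> groups in the given slice
    page_soup = get_soup_object(page_url)
    row_items = [[li.text, city]
                 for ul in page_soup.find_all('ul')[ul_start:ul_stop]
                 for li in ul.find_all('li')]
    return pd.DataFrame(row_items, columns=['Neighborhood', 'City'])

# e.g. scrape_neighborhood_list('https://en.wikipedia.org/wiki/Neighbourhoods_of_Delhi', 'Delhi', 1, 10)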

Delhi¶

In [78]:
delhi_data_url = 'https://en.wikipedia.org/wiki/Neighbourhoods_of_Delhi'
# initialize soup object
soup = get_soup_object(delhi_data_url)
print('soup object created')
soup object created
In [83]:
# get the relevant rows
row_groups = soup.find_all('ul')
row_groups = row_groups[1:10]
row_items = []
for row_group in row_groups:
    rows = row_group.find_all('li')
    for row in rows:
        row_items.append([row.text,'Delhi'])
# print the number of neighborhoods obtained
print(len(row_items))
185
In [84]:
# create a data frame
header = ['Neighborhood','City']
df = pd.DataFrame(data= row_items, columns= header)
df.tail()
Out[84]:
      Neighborhood   City
180  Tihar Village  Delhi
181    Tilak Nagar  Delhi
182    Uttam Nagar  Delhi
183    Vikas Nagar  Delhi
184      Vikaspuri  Delhi
In [85]:
# using the get_latlng function to define latitude and longitude columns of the data frame
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
Out[85]:
   Neighborhood   City   Latitude  Longitude
0  Adarsh Nagar  Delhi  28.720341  77.172661
1   Ashok Vihar  Delhi  28.690420  77.176064
2       Azadpur  Delhi  28.712420  77.173111
3        Bawana  Delhi  28.797661  77.045258
4     Begum Pur  Delhi  28.732599  77.052170
In [86]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 4 columns):
Neighborhood    185 non-null object
City            185 non-null object
Latitude        185 non-null float64
Longitude       185 non-null float64
dtypes: float64(2), object(2)
memory usage: 5.9+ KB
In [87]:
df.to_csv('delhi_subdiv.csv')

Chennai¶

In [38]:
# Initialize the url
chennai_data_url = 'https://en.wikipedia.org/wiki/List_of_neighbourhoods_of_Chennai'
# initialize soup object
soup = get_soup_object(chennai_data_url)
print('soup object created')
soup object created
In [45]:
# get the relevant rows
row_groups = soup.find_all('ul')
row_groups = row_groups[1:8]
row_items = []
for row_group in row_groups:
    rows = row_group.find_all('li')
    for row in rows:
        row_items.append([row.text,'Chennai'])
# print the number of neighborhoods obtained
print(len(row_items))
181
In [46]:
# create a data frame
header = ['Neighborhood','City']
df = pd.DataFrame(data= row_items, columns= header)
df.head()
Out[46]:
  Neighborhood     City
0    Red Hills  Chennai
1   Ayanavaram  Chennai
2    Royapuram  Chennai
3   Korukkupet  Chennai
4   Vyasarpadi  Chennai
In [47]:
# using the get_latlng function to define latitude and longitude columns of the data frame
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
Out[47]:
  Neighborhood     City  Latitude  Longitude
0    Red Hills  Chennai  13.19543  80.184303
1   Ayanavaram  Chennai  13.09883  80.232384
2    Royapuram  Chennai  13.11396  80.294220
3   Korukkupet  Chennai  13.11680  80.277298
4   Vyasarpadi  Chennai  13.11778  80.251678
In [48]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 4 columns):
Neighborhood    181 non-null object
City            181 non-null object
Latitude        181 non-null float64
Longitude       181 non-null float64
dtypes: float64(2), object(2)
memory usage: 5.7+ KB
In [49]:
df.to_csv('chennai_subdiv.csv')

Kolkata¶

In [59]:
# Initialize the url
kolkata_data_url = 'https://en.wikipedia.org/wiki/Neighbourhoods_in_Kolkata_Metropolitan_Area'
# initialize soup object
soup = get_soup_object(kolkata_data_url)
print('soup object created')
soup object created
In [71]:
# get the relevant rows
row_groups = soup.find_all('ul')
row_groups = row_groups[1:7]
row_items = []
for row_group in row_groups:
    rows = row_group.find_all('li')
    for row in rows:
        row_items.append([row.text,'Kolkata'])
        
print(len(row_items))
43
In [72]:
# create a data frame
header = ['Neighborhood','City']
df = pd.DataFrame(data= row_items, columns= header)
df.head()
Out[72]:
               Neighborhood     City
0      Kalyani Municipality  Kolkata
1     Gayespur Municipality  Kolkata
2  Kanchrapara Municipality  Kolkata
3    Halisahar Municipality  Kolkata
4      Naihati Municipality  Kolkata
In [73]:
# using the get_latlng function to define latitude and longitude columns of the data frame
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
Out[73]:
               Neighborhood     City   Latitude  Longitude
0      Kalyani Municipality  Kolkata  22.570539  88.371239
1     Gayespur Municipality  Kolkata  22.570539  88.371239
2  Kanchrapara Municipality  Kolkata  22.951059  88.431023
3    Halisahar Municipality  Kolkata  22.570539  88.371239
4      Naihati Municipality  Kolkata  22.895760  88.428757
In [74]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 4 columns):
Neighborhood    43 non-null object
City            43 non-null object
Latitude        43 non-null float64
Longitude       43 non-null float64
dtypes: float64(2), object(2)
memory usage: 1.4+ KB
In [75]:
df.to_csv('kolkata_subdiv.csv')
In [116]:
df = pd.read_csv('kolkata_subdiv.csv',index_col = 0)
In [118]:
df.head()
Out[118]:
               Neighborhood     City   Latitude  Longitude
0      Kalyani Municipality  Kolkata  22.570539  88.371239
1     Gayespur Municipality  Kolkata  22.570539  88.371239
2  Kanchrapara Municipality  Kolkata  22.951059  88.431023
3    Halisahar Municipality  Kolkata  22.570539  88.371239
4      Naihati Municipality  Kolkata  22.895760  88.428757

Mumbai¶

In [90]:
# Initialize the url
mumbai_data_url = 'https://en.wikipedia.org/wiki/List_of_neighbourhoods_in_Mumbai'
# initialize soup object
soup = get_soup_object(mumbai_data_url)
print('soup object created')
soup object created
In [102]:
# get the relevant rows
row_groups = soup.find_all('ul')
row_groups = row_groups[5:36]
row_items = []
for row_group in row_groups:
    rows = row_group.find_all('li')
    for row in rows:
        row_items.append([row.text,'Mumbai'])
        
print(len(row_items))
122
In [103]:
# create a data frame
header = ['Neighborhood','City']
df = pd.DataFrame(data= row_items, columns= header)
df.head()
Out[103]:
     Neighborhood    City
0          Amboli  Mumbai
1         Chakala  Mumbai
2      D.N. Nagar  Mumbai
3  Four Bungalows  Mumbai
4        JB Nagar  Mumbai
In [104]:
# using the get_latlng function to define latitude and longitude columns of the data frame
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
Out[104]:
     Neighborhood    City   Latitude  Longitude
0          Amboli  Mumbai  19.129061  72.846451
1         Chakala  Mumbai  19.108360  72.862343
2      D.N. Nagar  Mumbai  19.124084  72.831375
3  Four Bungalows  Mumbai  19.126301  72.824318
4        JB Nagar  Mumbai  19.105770  72.864098
In [105]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 4 columns):
Neighborhood    122 non-null object
City            122 non-null object
Latitude        122 non-null float64
Longitude       122 non-null float64
dtypes: float64(2), object(2)
memory usage: 3.9+ KB
In [106]:
df.to_csv('mumbai_subdiv.csv')

Visualize the locations of a city on a map¶

In [52]:
#!conda install -c conda-forge folium --yes # uncomment this line if folium is missing
import folium
In [111]:
# Function takes a data frame with Latitude, Longitude, Neighborhood and City columns
# and plots each location as a circle marker on the given folium map
def visualize_area_in_map(data, city_map):
    # add markers to the map
    for lat, lng, neighborhood, city in zip(data['Latitude'], data['Longitude'], data['Neighborhood'], data['City']):
        label = '{}, {}'.format(neighborhood, city)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=2,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7).add_to(city_map)

    return city_map

Visualize Mumbai's Neighborhoods¶

In [112]:
city = 'Mumbai'
latitude, longitude = get_latlng(city)
print('Lat : ',latitude,' Long : ',longitude)
Lat :  18.940170288085938  Long :  72.8348617553711
In [113]:
# create a map of Mumbai using the latitude and longitude values
city_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# drop rows with missing coordinates before plotting
data = df.dropna()

visualize_area_in_map(data, city_map)
Out[113]:
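The map renders inline in the notebook; to view or share it outside the notebook, it can also be written to a standalone HTML file, for example (the file name is arbitrary):

city_map.save('mumbai_neighborhoods.html')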
In [ ]: